/* ********************************************************************************* */
/* 0 ************************* Compustat dataset          ************************** */
/* ********************************************************************************* */
/* Summary   : Code builds an unbalanced firm year panel with annual financial data  */
/* ********************************************************************************* */

	libname data 'C:\Data\';

	
/*Extract data from Compustat FUNDA master file, downloaded from /wrdslin/comp/sasdata/naa */
/*Calculate firm's financial variables, not all are used in the paper */

	data compfunda; set data.funda;

	gvkeyN=gvkey*1;
	fyenddt = datadate;	format fyenddt date9.;
	l_GVKEY = lag(GVKEY);

	WC = RECT + INVT + ACO - LCT;

	ROA = OIBDP / lag(AT); 
	ROE = NI/lag(CEQ);
	FCF = OIBDP - TXT - CAPX; 
	lag_WCAP=lag(WCAP);	
	OCF = SALE - COGS - XSGA - (WCAP - lag_WCAP);
	salegr = SALE / lag(SALE) -1;
	lag_ppent=lag(ppent);
	stockreturn=PRCC_F/lag(PRCC_F)-1;

	if l_GVKEY ~= GVKEY then do; ROA = .; ROE = .; FCF = .; OCF = .; salegr = .; lag_ppent=.; stockreturn=.; end;

	adv=XAD;
	liq = (ACT - LCT) / AT;
	pe = PRCC_F / EPSPX;
	EBITDA=OIBDP;
	sales=SALE;
	cash = CHE;
	ppe=PPENT;
	sellexp=XSGA;
	if XRD>0 then do; rd=XRD; D_rd=1; end; else do; rd=0; D_rd=0; end;
	shrout = CSHO;
	mktcap = PRCC_F*CSHO; 
	assets = AT;
	equity = CEQ;
	assetsm = assets - equity + mktcap;
	ttldebt = DLTT + DLC;
	NI=NI;
	Q = assetsm / assets;
	MB = mktcap / equity;
	LVB = ttldebt / assets;
	LVM= ttldebt / assetsm;
	cashassets=cash/ assets;
	FCFassets=FCF / assets;
	OCFassets=OCF / assets;
	ltdebt=DLTT;
	
	keep naicsh sich id GVkeyN cusip fyenddt fyear cik  tic conm
		 intexp adv sales cash shrout mktcap assets equity assetsm ttldebt ltdebt salegr pe liq  
         Q MB LVB LVM OCF FCF ROA ROE NDI PPE COGS SellExp RD D_RD CAPX EBITDA NI GDWLIP GDWL OCFassets FCFassets cashassets
		 INVT INTAN	ACT OIBDP DVC RE WCAP DLTIS SSTK DLTR PRSTKC XINT AT
         tang liquid stockreturn ipodate lag_ppent prcc_f csho ceq dltt dlc seq dvc dvp che
		 ;
	run; 

* get first CRSP day from CRSP monthly stock return file ; 
* available from WRDS CRSP 'Monthly Stock - Securities' file, downloaded from /wrdslin/crsp/sasdata/m_stock */;
	data temp;set data.msf;
	keep cusip date; run;
	proc sort;by cusip date; run;
	data data.dateinc;set temp; by cusip;if first.cusip;rename date=dateinc;run;

	data dateinc; set data.dateinc; cusip6=substr(cusip,1,6); keep cusip6 dateinc; proc sort; by cusip6 dateinc; run;
	data dateinc; set dateinc; by cusip6; if first.cusip6; run;

	proc sql; create table comp1 as select * from compfunda as a left join Dateinc as b 
	on substr(a.cusip,1,6) = b.cusip6; quit; 
	proc sort data=comp1; by id descending dateinc; run;
	data comp; set comp1; by id; if first.id; rename GVkeyN=gvkey;
	if ipodate=. then ipodate=dateinc;
	firmage=(fyenddt-ipodate)/365;	
	run;
	
* download CRSP/Compustat merged header file from WRDS CRSP/Compustat Merged - Fundamentals Annual;
* https://wrds-www.wharton.upenn.edu/pages/get-data/center-research-security-prices-crsp/annual-update/crspcompustat-merged/fundamentals-annual/ ; 

	data sic; set data.headercrspcomp; gvkeyN=gvkey*1; keep GVkeyN fyear sic lpermno cusip; run;
	proc sql; create table comp as select * from comp as a left join sic as b on a.gvkey = b.gvkeyN and a.fyear=b.fyear; quit; 
	proc sort data=comp; by id; run;
	data comp; set comp; by id; if first.id; run;

	* fill up missing SIC;
	data sic1; set data.headercrspcomp; gvkeyN=gvkey*1; sic1=sic; keep GVkeyN sic1; run; proc sort nodupkey; by gvkeyn; run;
	proc sql; create table comp as select * from comp as a left join sic1 as b on a.gvkey = b.gvkeyN; quit; 
	proc sort data=comp; by id; run;
	data comp; set comp; by id; if first.id; if sic="" then sic=sic1; rename gvkey=gvkey; drop sic1; run;
	data comp; set comp; sicN=sic*1; drop sic; run; data comp; set comp; rename sicN=sic; run;

	* define FF 48 industry classification;	
	data comp; set comp; 

	if (sic>=100 & sic<=799)or sic=2048 then ind_name='agric';
	else if (sic>=2000 & sic<=2046) or (sic>=2050 & sic<=2063) or (sic>=2070 & sic<=2079)
		or (sic>=2090 & sic<=2095) or (sic>=2098 & sic<=2099) then ind_name='food';
	else if (sic>=2064 & sic<=2068) or (sic>=2086 & sic<=2087) or (sic>=2096 & sic<=2097)
		then ind_name='sod';
	else if (sic>=2080 & sic<=2085) then ind_name='beer';
	else if (sic>=2100 & sic<=2199)then ind_name='smoke';
	else if (sic>=900 & sic<=999) or (sic>=3650 & sic<=3652) or (sic>=3732 & sic<=3732)
		or (sic>=3930 & sic<=3949) then ind_name='toys';
	else if (sic>=7800 & sic<=7841) or (sic>=7900 & sic<=7999) then ind_name='fun';
	else if (sic>=2700 & sic<=2749) or (sic>=2770 & sic<=2799) then ind_name='books';
	else if (sic>=2047 & sic<=2047) or (sic>=2391 & sic<=2392) or (sic>=2510 & sic<=2519)
		or (sic>=2590 & sic<=2599) or (sic>=2840& sic<=2844) or (sic>=3160 & sic<=3199) 
		or (sic>=3229 & sic<=3231) or (sic>=3260 & sic<=3260)
		or (sic>=3262 & sic<=3263) or (sic>=3269 & sic<=3269) 
		or (sic>=3630 & sic<=3639) or (sic>=3750 & sic<=3751) or (sic>=3800 & sic<=3800)
		or (sic>=3860 & sic<=3879) or (sic>=3910 & sic<=3919) 
		or (sic>=3960 & sic<=3961) or (sic>=3991 & sic<=3991) or (sic>=3995 & sic<=3995)
		then ind_name='hshld';
	else if (sic>=2300 & sic<=2390) or (sic>=3020 & sic<=3021) or (sic>=3100 & sic<=3111)
		or (sic>=3130 & sic<=3159) or (sic>=3965 & sic<=3965) then ind_name='clths';
	else if (sic>=8000 & sic<=8099) then ind_name='hlth';
	else if (sic>=3693 & sic<=3693) or (sic>=3840 & sic<=3851) then ind_name='medeq';
	else if (sic>=2830 & sic<=2836) then ind_name='drugs';
	else if (sic>=2800 & sic<=2829) or (sic>=2850 & sic<=2899)then ind_name='chems';
	else if (sic>=3000 & sic<=3000) or (sic>=3050 & sic<=3099)then ind_name='rubbr';
	else if (sic>=2200 & sic<=2295) or (sic>=2297 & sic<=2299) or (sic>=2393 & sic<=2395)
		or (sic>=2397 & sic<=2399) then ind_name='txtls';
	else if (sic>=800 & sic<=899) or (sic>=2400 & sic<=2439) or (sic>=2450 & sic<=2459)
		or (sic>=2490 & sic<=2499) or (sic>=2950 & sic<=2952) 
		or (sic>=3200 & sic<=3219) or (sic>=3240 & sic<=3259) or (sic>=3261 & sic<=3261)
		or (sic>=3264 & sic<=3264) or (sic>=3270 & sic<=3299) 
		or (sic>=3420 & sic<=3442) or (sic>=3446 & sic<=3452) or (sic>=3490 & sic<=3499)
		or (sic>=3996 & sic<=3996) then ind_name='bldmt';
	else if (1500<=sic and sic<=1549) or (1600<=sic and sic<=1699) or ( 1700<=sic and sic<=1799) then ind_name='cnstr';
	else if (3300<=sic and sic<=3369) or (3390<=sic and sic<=3399) then ind_name='steel';
	else if (sic=3400) or (3443<=sic and sic<=3444) or (3460<=sic and sic<=3479) then ind_name='fabpr';
	else if (3510<=sic and sic<=3536) or (3540<=sic and sic<=3569) or (3580<=sic and sic<=3599) then ind_name='mach';
	else if (3600<=sic and sic<=3621) or (3623<=sic and sic<=3629) or (3640<=sic and sic<=3646) or
			(3648<=sic and sic<=3649) or (3660<=sic and sic<=3660) or (3691<=sic and sic<=3692) or
			(3699<=sic and sic<=3699) then ind_name='elceq';
	else if (2296<=sic and sic<=2296) or (2396<=sic and sic<=2396) or (3010<=sic and sic<=3011) or
			(3537<=sic and sic<=3537) or (3647<=sic and sic<=3647) or (3694<=sic and sic<=3694) or
			(3700<=sic and sic<=3716) or (3790<=sic and sic<=3792) or (3799<=sic and sic<=3799) then ind_name='auto';
	else if (3720<=sic and sic<=3729) then ind_name='aero';
	else if (3730<=sic and sic<=3731) or (3740<=sic and sic<=3743) then ind_name='ships';
	else if (3480<=sic and sic<=3489) or (3760<=sic and sic<=3769) or (3795<=sic and sic<=3795) then ind_name='guns';
	else if (1040<=sic and sic<=1049) then ind_name='gold';
	else if (1000<=sic and sic<=1039) or (1060<=sic and sic<=1099) or (1400<=sic and sic<=1499) then ind_name='mines';
	else if (1200<=sic and sic<=1299) then ind_name='coal';
	else if (1310<=sic and sic<=1389) or (2900<=sic and sic<=2911) or (2990<=sic and sic<=2999) then ind_name='energ';
	else if (4900<=sic and sic<=4999) then ind_name='util';
	else if (4800<=sic and sic<=4899) then ind_name='telem';
	else if (7020<=sic and sic<=7021) or (7030<=sic and sic<=7039) or (7200<=sic and sic<=7212) or
			(7215<=sic and sic<=7299) or (7395<=sic and sic<=7395) or (7500<=sic and sic<=7500) or
			(7520<=sic and sic<=7549) or (7600<=sic and sic<=7699) or (8100<=sic and sic<=8199) or
			(8200<=sic and sic<=8299) or (8300<=sic and sic<=8399) or (8400<=sic and sic<=8499) or
			(8600<=sic and sic<=8699) or (8800<=sic and sic<=8899) then ind_name='persv';
	else if (2750<=sic and sic<=2759) or (3993<=sic and sic<=3993) or (7300<=sic and sic<=7372) or
			(7374<=sic and sic<=7394) or (7397<=sic and sic<=7397) or (7399<=sic and sic<=7399) or
			(7510<=sic and sic<=7519) or (8700<=sic and sic<=8748) or (8900<=sic and sic<=8999) then ind_name='bussv';
	else if (3570<=sic and sic<=3579) or (3680<=sic and sic<=3689) or (3695<=sic and sic<=3695) or
			(7373<=sic and sic<=7373) then ind_name='comps';
	else if (3622<=sic and sic<=3622) or (3661<=sic and sic<=3679) or (3810<=sic and sic<=3810) or
			(3812<=sic and sic<=3812) then ind_name='chips';
	else if (3811<=sic and sic<=3811) or (3820<=sic and sic<=3830) then ind_name='labeq';
	else if (2520<=sic and sic<=2549) or (2600<=sic and sic<=2639) or (2670<=sic and sic<=2699) or
			(2760<=sic and sic<=2761) or (3950<=sic and sic<=3955) then ind_name='paper';
	else if (2440<=sic and sic<=2449) or (2640<=sic and sic<=2659) or (3210<=sic and sic<=3221) or
			(3410<=sic and sic<=3412) then ind_name='boxes';
	else if (4000<=sic and sic<=4099) or (4100<=sic and sic<=4199) or (4200<=sic and sic<=4299) or
			(4400<=sic and sic<=4499) or (4500<=sic and sic<=4599) or (4600<=sic and sic<=4699) or
			(4700<=sic and sic<=4799) then ind_name='trans';
	else if (5000<=sic and sic<=5099) or (5100<=sic and sic<=5199) then ind_name='whlsl';
	else if (5200<=sic and sic<=5299) or (5300<=sic and sic<=5399) or (5400<=sic and sic<=5499) or
			(5500<=sic and sic<=5599) or (5600<=sic and sic<=5699) or (5700<=sic and sic<=5736) or
			(5900<=sic and sic<=5999) then ind_name='rtail';
	else if (5800<=sic and sic<=5813) or (5890<=sic and sic<=5890) or (7000<=sic and sic<=7019) or
			(7040<=sic and sic<=7049) or (7213<=sic and sic<=7213) then ind_name='meals';
	else if (6000<=sic and sic<=6099) or (6100<=sic and sic<=6199) then ind_name='banks';
	else if (6300<=sic and sic<=6399) or (6400<=sic and sic<=6411) then ind_name='insur';
 	else if (6500<=sic and sic<=6553) then ind_name='riest';
	else if (6200<=sic and sic<=6299) or (6700<=sic and sic<=6799) then ind_name='fin';
	else ind_name='other';

	length indname12 $60 ; length indname12 $60; 
	if 0100<=sic<=0999 or 2000<=sic<=2399 or 2700<=sic<=2749 or 2770<=sic<=2799 or 3100<=sic<=3199 or 3940<=sic<=3989 then indname12='01. Nondurable consumer goods'; 
	else if 2500<=sic<=2519 or 2590<=sic<=2599 or 3630<=sic<=3659 or 3710<=sic<=3711 or 3714<=sic<=3714 or 3716<=sic<=3716 or 3750<=sic<=3751 or 3792<=sic<=3792 or 3900<=sic<=3939 or 3990<=sic<=3999 then indname12='02. Durable consumer goods'; 
	else if 2520<=sic<=2589 or 2600<=sic<=2699 or 2750<=sic<=2769 or 3000<=sic<=3099 or 3200<=sic<=3569 or 3580<=sic<=3629 or 3700<=sic<=3709 or 3712<=sic<=3713 or 3715<=sic<=3715 or 3717<=sic<=3749 or 3752<=sic<=3791 or 3793<=sic<=3799 or 3830<=sic<=3839 or 3860<=sic<=3899 then indname12='03. Manufacturing'; 
	else if 1200<=sic<=1399 or 2900<=sic<=2999 then indname12='04. Energy'; 
	else if 2800<=sic<=2829 or 2840<=sic<=2899 then indname12='05. Chemical'; 
	else if 3570<=sic<=3579 or 3660<=sic<=3692 or 3694<=sic<=3699 or 3810<=sic<=3829 or 7370<=sic<=7379 then indname12='06. Business equipment'; 
	else if 4800<=sic<=4899 then indname12='07. Telecommunication'; 
	else if 4900<=sic<=4949 then indname12='08. Utilities'; 
	else if 5000<=sic<=5999 or 7200<=sic<=7299 or 7600<=sic<=7699 then indname12='09. Shops'; 
	else if 2830<=sic<=2839 or 3693<=sic<=3693 or 3840<=sic<=3859 or 8000<=sic<=8099 then indname12='10. Health'; 
	else if 6000<=sic<=6999 then indname12='11. Finance'; 
	else indname12='12. Other'; 

	run;

	* get industry adjusted values for FF48 and 2-digit-SIC;
	data comp_ind; set comp; keep fyear ind_name ROA ROE Q MB lvm cashassets; run;
	proc sort data=comp_ind; by fyear ind_name; run;
	proc univariate data=comp_ind noprint; by fyear ind_name; var ROA ROE Q MB lvm cashassets;
	output out=indmedian median=indROA indROE indQ indMB indlvm indcashassets; run;

	proc sql; create table comp1 as select * from comp as a left join indmedian as b on a.fyear = b.fyear and a.ind_name=b.ind_name; quit; 

	data comp_indsic; set comp; sic2=floor(sic/100); keep fyear sic2 ROA ROE Q MB lvm cashassets; run;
	proc sort data=comp_indsic; by fyear sic2; run;
	proc univariate data=comp_indsic noprint; by fyear sic2; var ROA ROE Q MB lvm cashassets;
	output out=indmediansic median=indROAsic indROEsic indQsic indMBsic indlvmsic indcashassetssic; run;

	proc sql; create table comp2 as select * from comp1 as a left join indmediansic as b on a.fyear = b.fyear and floor(a.sic/100)=b.sic2; quit; 

	data data.comp; set comp2; 
	ROA_adj=ROA-indROA; ROE_adj=ROE-indROE; Q_adj=Q-indQ; MB_adj=MB-indMB; lvm_adj=lvm-indlvm; cashassets_adj=cashassets-indcashassets;
	ROA_adjsic=ROA-indROAsic; ROE_adjsic=ROE-indROEsic; Q_adjsic=Q-indQsic; MB_adjsic=MB-indMBsic; lvm_adjsic=lvm-indlvmsic; cashassets_adjsic=cashassets-indcashassetssic;
	drop indROA indROE indQ indMB indlvm indcashassets indROAsic indROEsic indQsic indMBsic indlvmsic indcashassetssic; if fyear=. then delete; run; 
	proc sort data=data.comp nodup; by gvkey fyear; run;

* add prefix a_ to comp;
	%macro prefixvars(inpdsn,prefix,outdsn,excludevars=); %let num=1;	%let excludevar=%scan(%upcase(&excludevars),&num,' ');
	%let excludevar&num=&excludevar; %do %while(&excludevar ne ); %let num=%eval(&num + 1);	%let excludevar=%scan(&excludevars,&num,' ');
	%let excludevar&num=&excludevar; %end;	%let numkeyvars=%eval(&num - 1); 	%let dsid=%sysfunc(open(&inpdsn)); 	%let numvars=%sysfunc(attrn(&dsid,nvars)); 	
	data &outdsn;	set &inpdsn(rename=(%do i = 1 %to &numvars;	%let flag=N;	%let var&i=%sysfunc(varname(&dsid,&i));	%do j=1 %to &numkeyvars;
	%if %upcase(&&var&i) eq &&excludevar&j %then %let flag=Y;	%end;	%if &flag eq N %then %do; &&var&i=&prefix&&var&i %end;	%end;));
	%let rc=%sysfunc(close(&dsid));	run;	%mend prefixvars;

	%prefixvars(data.comp,a_,a_comp,excludevars=)
	%prefixvars(data.comp,t_,t_comp,excludevars=)
